Source: Sally Wang@Columbia Systems Lab
We built this interactive dashboard to demonstrate our differentially private systems research on data optimization. The dashboard illustrates two key benefits of our design: 1) statistical data can be analyzed without revealing sensitive information about key stakeholders; 2) privacy-budget utilization is optimized via data block substitutions that return results similar to those requested by clients. In this interactive dashboard, you'll see a comparative analysis of differentially private and non-private reports for queries on global COVID-19 datasets.
This dashboard shows five different interactive plots of COVID-19 data.
# importing libraries
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import diffprivlib.tools as dp
import statistics
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import plotly.graph_objects as go
import seaborn as sns
import ipywidgets as widgets
# loading data
# Three global time-series tables plus one per-country summary table, all
# fetched from the CSSEGISandData/COVID-19 GitHub repository.
death_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
confirmed_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
recovered_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
country_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')

# data cleaning
# renaming the df column names to lowercase
country_df.columns = country_df.columns.str.lower()
confirmed_df.columns = confirmed_df.columns.str.lower()
death_df.columns = death_df.columns.str.lower()
recovered_df.columns = recovered_df.columns.str.lower()

# changing province/state to state and country/region to country
_time_series_renames = {'province/state': 'state', 'country/region': 'country'}
confirmed_df = confirmed_df.rename(columns=_time_series_renames)
# Rename recovered_df from itself; deriving it from confirmed_df would
# silently replace the recovered data with a copy of the confirmed data.
recovered_df = recovered_df.rename(columns=_time_series_renames)
death_df = death_df.rename(columns=_time_series_renames)
country_df = country_df.rename(columns={'country_region': 'country'})
def private_mean(column, privacy_budget: float) -> float:
    """Return a differentially private mean of ``column``.

    The query is delegated to ``diffprivlib.tools.mean`` with the fixed
    bounds ``(0, 27746)`` and a float result dtype.

    Args:
        column: Array-like of numeric values to average.
        privacy_budget: Epsilon passed to the DP mean query.

    Returns:
        The noisy mean as a float.
    """
    return dp.mean(column, epsilon=privacy_budget, bounds=(0, 27746), dtype="float")
def mortality_mean(column, privacy_budget: float) -> float:
    """Return a differentially private mean of a mortality-rate column.

    Same query as a plain DP mean, but with the much narrower fixed bounds
    ``(0, 1.2)`` passed to ``diffprivlib.tools.mean``.

    Args:
        column: Array-like of numeric values to average.
        privacy_budget: Epsilon passed to the DP mean query.

    Returns:
        The noisy mean as a float.
    """
    return dp.mean(column, epsilon=privacy_budget, bounds=(0, 1.2), dtype="float")
def dp_sum(column, epsilon: float) -> float:
    """Return a differentially private, NaN-ignoring sum of ``column``.

    The query is delegated to ``diffprivlib.tools.nansum`` with the fixed
    bounds ``(100, 91563979)``.

    Args:
        column: Array-like of numeric values to sum.
        epsilon: Privacy budget for the query.

    Returns:
        The noisy sum (a float, not an int: noise is added to the total).
    """
    return dp.nansum(column, epsilon=epsilon, bounds=(100, 91563979))
# Differentially private headline figures: total confirmed, total deaths,
# mean incident rate and mean mortality rate.
# Missing values in the four source columns are replaced by 0 first.
for _col in ('confirmed', 'deaths', 'incident_rate', 'mortality_rate'):
    country_df[_col] = country_df[_col].replace(np.nan, 0)

# Every query below is issued with epsilon = 1.
confirmed_total = dp_sum(country_df['confirmed'].to_numpy(), 1)
deaths_total = dp_sum(country_df['deaths'], 1)
incident_rate = private_mean(country_df['incident_rate'], 1)
mortality_rate = mortality_mean(country_df['mortality_rate'], 1)
To start, the table below shows the raw statistics in a non-private setting. Enter a number in the text box to view the countries with the highest number of confirmed cases.
# sorting the values by confirmed in descending order
# country_df.sort_values('confirmed', ascending= False).head(10).style.background_gradient(cmap='copper')
# sorting the values by confirmed in descending order
# country_df.sort_values('confirmed', ascending= False).head(10).style.background_gradient(cmap='copper')
# Empty placeholder figure widget. At module level it is only referenced by
# the hidden VBox containers; the plotting functions build their own local figures.
fig = go.FigureWidget( layout=go.Layout() )
def highlight_col(x):
    """Build a CSS style frame that colours four columns of ``x``.

    Intended for ``DataFrame.style.apply(..., axis=None)``: the result has
    the same index and columns as ``x``. Every cell is an empty string except
    those in positional columns 4, 5, 8 and 11, which carry a
    ``background-color`` declaration.

    Args:
        x: DataFrame being styled; it needs at least 12 columns.

    Returns:
        DataFrame of CSS strings shaped like ``x``.
    """
    # positional column -> background colour
    palette = {
        4: '#5d73fc',
        5: '#f2493a',
        8: '#2bba0f',
        11: '#9dba0f',
    }
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    for position, colour in palette.items():
        styles.iloc[:, position] = 'background-color: ' + colour
    return styles
def show_latest_cases(TOP):
    """Return a styled table of the ``TOP`` countries with most confirmed cases.

    Args:
        TOP: Number of rows to show; anything ``int()`` accepts (the
            interactive text box supplies a string).

    Returns:
        A pandas Styler over the top rows with ``highlight_col`` applied
        to the whole table.
    """
    row_count = int(TOP)
    ranked = country_df.sort_values('confirmed', ascending=False)
    top_rows = ranked.head(row_count)
    return top_rows.style.apply(highlight_col, axis=None)
# Text-box driven table of the top countries (default: 10 rows).
interact(show_latest_cases, TOP='10')
# The placeholder figure is wrapped in a container whose display is 'none',
# so it is not rendered; drop display='none' to make it visible.
ipywLayout = widgets.Layout(border='solid 2px green', display='none')
widgets.VBox([fig], layout=ipywLayout)
Enter the name of a country with the first letter capitalized (e.g., Italy), or "World" for the global totals:
def plot_cases_of_a_country(country):
    """Plot confirmed cases and deaths over time for one country or the world.

    One line is drawn per metric, each summed over all rows matching the
    selection, and the figure is shown immediately.

    Args:
        country: Value to match against the ``country`` column, or
            ``'World'`` / ``'world'`` to sum over every row.
    """
    # Both axes must be cut from the same positional column so that every
    # date lines up with its own total (the plot starts at column 20).
    first_col = 20
    # (legend label, line colour, line width, source frame)
    series = (
        ('confirmed', 'blue', 4, confirmed_df),
        ('deaths', 'red', 5, death_df),
    )

    fig = go.Figure()
    for label, colour, width, df in series:
        if country in ('World', 'world'):
            window = df.iloc[:, first_col:]
        else:
            window = df[df['country'] == country].iloc[:, first_col:]
        x_data = np.array(list(window.columns))
        # Sum over rows; an unmatched country yields an all-zero series.
        y_data = np.sum(np.asarray(window), axis=0)
        fig.add_trace(go.Scatter(
            x=x_data,
            y=y_data,
            mode='lines+markers',
            name=label,
            line=dict(color=colour, width=width),
            connectgaps=True,
            text="Total " + str(label) + ": " + str(y_data[-1]),
        ))

    fig.update_layout(
        title="Cases of " + country,
        xaxis_title='Date',
        yaxis_title='Number of Confirmed Cases',
        margin=dict(l=20, r=20, t=40, b=20),
        paper_bgcolor="lightgrey",
        width=800,
    )
    fig.update_yaxes(type="linear")
    fig.show()
# Text-box driven time-series plot (default selection: the whole world).
interact(plot_cases_of_a_country, country='World')
# The placeholder figure is wrapped in a container whose display is 'none',
# so it is not rendered; drop display='none' to make it visible.
ipywLayout = widgets.Layout(border='solid 2px green', display='none')
widgets.VBox([fig], layout=ipywLayout)
The summary below shows differentially private statistics.
# displaying the total stats
# One coloured <span> per differentially private headline figure.
# Counts and the incident rate are truncated to integers; the mortality
# rate is rounded to two decimals.
summary_html = (
    "<div style = 'background-color: #d1cdcd; padding: 30px '>"
    f"<span style='color: #5d73fc; font-size:30px;'> Confirmed: {int(confirmed_total)}</span>"
    # The stray '#' that used to sit inside this <span> tag has been removed.
    f"<span style='color: #f2493a; font-size:30px;margin-left:20px;'> Deaths: {int(deaths_total)}</span>"
    f"<span style='color: #2bba0f; font-size:30px; margin-left:20px;'> average incident_rate:{int(incident_rate)}</span>"
    f"<span style='color: #9dba0f; font-size:30px; margin-left:20px;'> average mortality_rate: {round(mortality_rate, 2)!s}</span>"
    "</div>"
)
display(HTML(summary_html))
To protect the identity of the worst-hit countries, we present the following data in a differentially private setting. You can publicly present the worst-hit countries' COVID statistics without revealing further information about those countries.
def perturb(column):
    """Add random Gaussian noise to every element of ``column``, in place.

    The noise distribution is parameterised by a differentially private mean
    and variance of ``column`` itself, each queried with epsilon = 1 and
    bounds ``(1, 10)``.

    Args:
        column: pandas Series or NumPy array of numeric values. It is
            modified in place.

    Returns:
        The same ``column`` object, after the noise has been added.
    """
    # (1, 10) are the lower and upper bounds handed to the DP queries.
    mu = dp.mean(column, epsilon=1, bounds=(1, 10))
    # np.random.normal takes a standard deviation as its scale, so the DP
    # variance has to be square-rooted before use.
    sigma = np.sqrt(dp.var(column, epsilon=1, bounds=(1, 10)))
    # NOTE(review): the noise is centred on mu rather than 0, so every value
    # is shifted by roughly mu on average -- confirm this is intended.
    noise = np.random.normal(mu, sigma, size=len(column))
    if isinstance(column, pd.Series):
        # Write by position: ``column[i]`` is label-based on a Series and
        # hits the wrong rows (or raises) when the index is not 0..n-1.
        column.iloc[:] = column.to_numpy() + noise
    else:
        column[:] = np.asarray(column) + noise
    return column
#country_df['confirmed'] = perturb(country_df['confirmed'])
# Rank countries by confirmed cases, most affected first.
sorted_country_df = country_df.sort_values('confirmed', ascending=False)
# Reduce every country name to its first letter. The assignment must target
# the column: ``.loc['country'] = ...`` addresses a row label instead and
# leaves the names untouched.
sorted_country_df['country'] = sorted_country_df['country'].astype(str).str[0]
# plotting the worst hit countries
def bubble_chart(TOP):
    """Show a bubble chart of the first ``TOP`` rows of ``sorted_country_df``.

    Bubble height and size both encode the ``confirmed`` column; colour and
    x position encode the ``country`` column. Values are plotted as stored:
    no perturbation is applied inside this function.

    Args:
        TOP: Number of leading rows to plot.
    """
    worst_hit = sorted_country_df.head(TOP)
    chart = px.scatter(
        worst_hit,
        x="country",
        y="confirmed",
        size="confirmed",
        color="country",
        hover_name="country",
        size_max=60,
    )
    chart.update_layout(
        title=str(TOP) + " Worst Hit Countries",
        xaxis_title="Countries",
        yaxis_title="Confirmed Cases",
        width=700,
    )
    chart.show()
# Interactive bubble chart of the worst hit countries (default: 10).
interact(bubble_chart, TOP=10)
# The placeholder figure is wrapped in a container whose display is 'none',
# so it is not rendered.
ipywLayout = widgets.Layout(border='solid 2px green', display='none')
widgets.VBox([fig], layout=ipywLayout)
We present an interactive DP global map of confirmed cases, deaths and death rates. Although the map reveals the rough location of each country, the statistics are differentially private and the name of each country is represented by a single letter only. By doing so, this dashboard maximizes the data privacy of each country while providing useful information for health organizations and researchers.
You can click on the circles to view DP statistics of each country.
# Rows without coordinates are drawn at (0, 0).
confirmed_df['lat'] = confirmed_df['lat'].replace(np.nan, 0)
confirmed_df['long'] = confirmed_df['long'].replace(np.nan, 0)
# Keep only the first letter of each country name.
# NOTE(review): this overwrites confirmed_df['country'] for the whole session,
# so later look-ups by full country name on confirmed_df no longer match.
confirmed_df['country'] = confirmed_df['country'].astype(str).str[0]

world_map = folium.Map(location=[11, 0], tiles="cartodbpositron", zoom_start=2, max_zoom=6, min_zoom=2)

# One circle per row of confirmed_df. death_df is read at the same row
# position, so the two frames are assumed to be row-aligned.
for i in range(len(confirmed_df)):
    row = confirmed_df.iloc[i]
    # The last column holds the most recent figures.
    confirmed_i = confirmed_df.iloc[i, -1]
    deaths_i = death_df.iloc[i, -1]

    # DP figures for THIS row only. Deaths come from death_df; previously
    # both lines summed the whole confirmed column for every circle.
    # NOTE(review): dp_sum uses fixed bounds sized for global totals, so the
    # noise on a single row is large -- confirm the bounds for this use.
    dp_confirmed = dp_sum(confirmed_df.iloc[[i], -1], 1)
    dp_deaths = dp_sum(death_df.iloc[[i], -1], 1)
    # The small offset in the denominator avoids division by zero.
    death_rate = np.round(deaths_i / (confirmed_i + 1.00001) * 100, 2)

    tooltip_html = (
        "<div style='margin: 0; background-color: black; color: white;'>"
        + "<h4 style='text-align:center;font-weight: bold'>" + row['country'] + "</h4>"
        + "<hr style='margin:10px;color: white;'>"
        + "<ul style='color: white;;list-style-type:circle;align-item:left;padding-left:20px;padding-right:20px'>"
        + "<li>Confirmed: " + str(dp_confirmed) + "</li>"
        + "<li>Deaths: " + str(dp_deaths) + "</li>"
        + "<li>Death Rate: " + str(death_rate) + "</li>"
        + "</ul></div>"
    )

    folium.Circle(
        location=[row['lat'], row['long']],
        fill=True,
        # Radius grows with the integer part of log(confirmed).
        radius=(int(np.log(confirmed_i + 1.00001)) + 0.2) * 5000,
        color='red',
        fill_color='indigo',
        tooltip=tooltip_html,
    ).add_to(world_map)

world_map